require(ltm)
require(readr)
require(survey)

# Attribute labels as they appear in the conjoint tables: one character
# vector per experimental condition (element 1 is used when condition == 1,
# element 2 when condition == 2). The position of a label inside its vector
# is the attribute number used when the tables are decoded.
attribute.wording <- local({
  # labels 4-6 are worded identically in both conditions
  shared.labels <- c("Opinions on recent political issues",
                     "Voting habits in previous federal elections",
                     "Vote intention for next presidential election")
  list(
    c("Political party their parents supported",
      "Participation in Democratic party activities",
      "When someone criticizes the Democratic party ...",
      shared.labels),
    c("Political party his/her parents supported",
      "Participation in Republican party activities",
      "When someone criticizes the Republican party ...",
      shared.labels)
  )
})
# Level texts for every attribute: level.wording[[condition]][[attribute]] is
# the character vector of possible levels, and the position of a text inside
# that vector is the numeric level code stored for a profile.
level.wording <- local({
  # these two attributes use the same level texts in both conditions
  activity.levels <- c("very often", "often", "sometimes", "rarely", "never")
  identity.levels <- c("it feels like a personal insult to the person",
                       "it doesn't matter to the person")

  democratic <- list(
    c("the Democratic Party",
      "neither the Democratic Party nor the Republican Party",
      "the Republican Party"),
    activity.levels,
    identity.levels,
    c("completely approves of the Democratic party's position",
      "partially approves of the Democratic party's position",
      "doesn't approve of the Democratic party's position"),
    c("always voted for the Democratic candidates",
      "sometimes voted for the Democratic candidates and voted for the Republican candidates at other times",
      "sometimes voted for the Democratic candidates and abstained at other times",
      "always abstained"),
    c("Democratic candidate", "Republican candidate", "abstain")
  )

  republican <- list(
    c("the Republican Party",
      "neither the Democratic Party nor the Republican Party",
      "the Democratic Party"),
    activity.levels,
    identity.levels,
    c("completely approves of the Republican party's position",
      "partially approves of the Republican party's position",
      "doesn't approve of the Republican party's position"),
    c("always voted for the Republican candidates",
      "sometimes voted for the Republican candidates and voted for the Democratic candidates at other times",
      "sometimes voted for the Republican candidates and abstained at other times",
      "always abstained"),
    c("Republican candidate", "Democratic candidate", "abstain")
  )

  list(democratic, republican)
})

#### individual-level data ####
# load the raw survey export; N is its number of rows (one per respondent)
raw.data <- read.csv(file = "raw_data.csv")
N <- nrow(raw.data)

# distribution of race (Online Appendix Footnote 4)
round(prop.table(table(raw.data$Q21)), digits = 3)

# respondent ID is the row position in raw.data
ID <- 1:N

# shift the gender and age codes so that the first category becomes 0
gender <- raw.data$Q1 - 1
age <- raw.data$Q2 - 1

# education codes above 5 are treated as missing
education <- ifelse(raw.data$Q19 > 5, NA, raw.data$Q19)

# race dummies (Q21 == 2 -> black, Q21 > 2 -> other); code 8 is set to missing
black <- ifelse(raw.data$Q21 == 8, NA, as.numeric(raw.data$Q21 == 2))
other.race <- ifelse(raw.data$Q21 == 8, NA, as.numeric(raw.data$Q21 > 2))

# region of states (1: Northeast, 2: Midwest, 3: South, 4: West)
# states.list[k] is the region of the state coded k in Q3
states.list <- c(
  3, 4, 4, 3, 4, 4, 1, 3, 3, 3,
  3, 4, 4, 2, 2, 2, 2, 3, 3, 1,
  3, 1, 2, 2, 3, 2, 4, 2, 4, 1,
  1, 4, 1, 3, 2, 2, 3, 4, 1, 1,
  3, 2, 3, 3, 4, 1, 3, 4, 3, 2,
  4
)
region <- states.list[raw.data$Q3]

# party identification: Q4 codes 1 and 2 are kept, every other code becomes 3
PID <- ifelse(
  raw.data$Q4 == 1,
  1,
  ifelse(raw.data$Q4 == 2, 2, 3)
)

# correct answers for directed questions
# (TRUE only when Q5_4 is answered 1 and Q5_8 is answered 4)
directed <- (raw.data$Q5_4 == 1) & (raw.data$Q5_8 == 4)

# correct answers for an instructional manipulation check (IMC)
# The eight Q18_* items are taken from columns 201-208 of raw.data.
# Missing answers are recoded to 0 before scoring (presumably unticked
# options -- confirm against the survey export).
IMC.data <- raw.data[, 201:208]
IMC.data[is.na(IMC.data)] <- 0
# a respondent passes only when items 3 and 8 equal 1 and all others equal 0
IMC <- IMC.data$Q18_1 == 0 & IMC.data$Q18_2 == 0 & 
  IMC.data$Q18_3 == 1 & IMC.data$Q18_4 == 0 & 
  IMC.data$Q18_5 == 0 & IMC.data$Q18_6 == 0 & 
  IMC.data$Q18_7 == 0 & IMC.data$Q18_8 == 1
# pass rate (Online Appendix C)
# IMC is TRUE for respondents who passed, so the pass rate is the share of
# TRUE values. The earlier sum(IMC == 0) / N counted the respondents who
# failed, i.e. it printed the failure rate under a "pass rate" label.
round(mean(IMC), 3)

# response time in conjoint tasks
# one row per respondent; columns are the four task timers of the Democratic
# condition followed by the four task timers of the Republican condition
time.spent <- cbind(
  raw.data$T9.D_3, raw.data$T10.D_3, raw.data$T11.D_3, raw.data$T12.D_3,
  raw.data$T9.R_3, raw.data$T10.R_3, raw.data$T11.R_3, raw.data$T12.R_3
)
# times of five or less are discarded
is.na(time.spent) <- which(time.spent <= 5)
# TRUE when at least one of the eight timers is still available
time.dummy <- rowSums(is.na(time.spent)) < 8

# individual-level data of attentive respondents
# (we did not use the IMC; see Online Appendix C)
# keep respondents who answered the directed questions correctly and have
# at least one usable task timer
attentive.data <- subset(
  x = cbind(ID, raw.data, gender, age, education,
            black, other.race, region, PID),
  subset = directed == TRUE & time.dummy == TRUE
)

# number of respondents with missing values for their education or race
# (Online Appendix Footnote 5)
# first the count, then the share among attentive respondents
sum(!complete.cases(attentive.data$education, attentive.data$black))
round(
  sum(!complete.cases(attentive.data$education, attentive.data$black)) /
    nrow(attentive.data),
  digits = 3
)

# political knowledge based on the item response theory (IRT) model
# each quiz item is scored 1 when the answer equals the keyed code, else 0
# (a missing answer stays missing)
House <- as.numeric(attentive.data$Q7 == 2)
Senate <- as.numeric(attentive.data$Q8 == 2)
Ryan <- as.numeric(attentive.data$Q9 == 4)
Biden <- as.numeric(attentive.data$Q10 == 1)
president <- as.numeric(attentive.data$Q11 == 2)
senator <- as.numeric(attentive.data$Q12 == 6)
override <- as.numeric(attentive.data$Q13 == 2)
quiz.data <- data.frame(
  House, Senate, Ryan, Biden,
  president, senator, override
)
# fit the latent trait model with one factor (z1) and score every respondent
# on his or her own response pattern
IRT.result <- ltm(quiz.data ~ z1)
IRT.score.result <- factor.scores(IRT.result, resp.patterns = quiz.data)
knowledge <- IRT.score.result[["score.dat"]][["z1"]]
# create a variable vector including inattentive respondents
# (length N, indexed by respondent ID, NA for respondents that were dropped)
knowledge.all <- replace(rep(NA, N), attentive.data$ID, knowledge)

# ideology based on the IRT model
# nine issue items: columns 101-111 of attentive.data except 104 and 108
issue.attitudes <- attentive.data[, setdiff(101:111, c(104, 108))]
# fit a graded response model and score every respondent on his or her own
# response pattern
GRM.result <- grm(issue.attitudes)
GRM.score.result <- factor.scores(GRM.result, resp.patterns = issue.attitudes)
ideology <- GRM.score.result[["score.dat"]][["z1"]]
# create a variable vector including inattentive respondents
# (length N, indexed by respondent ID, NA for respondents that were dropped)
ideology.all <- replace(rep(NA, N), attentive.data$ID, ideology)

# append both scores to the attentive-respondent data and store it on disk
attentive.data <- data.frame(attentive.data,
                             knowledge = knowledge,
                             ideology = ideology)

save(list = "attentive.data", file = "attentive_data.Rdata")

#### profile-level data ####
# experimental condition (1: Democratic condition, 2: Republican condition)
# condition 1 is assigned when Q4 == 1, or when Q4 > 2 and the `condition`
# column of raw.data is 0; the other non-missing cases get condition 2
condition <- ifelse(
  raw.data$Q4 == 1 | (raw.data$Q4 > 2 & raw.data$condition == 0),
  1,
  2
)

# record the place of each attribute in conjoint tables for each respondent
# attribute[i, r] is the attribute number shown in table row r to respondent i;
# the six row labels are read from columns 2, 4, ..., 12 of raw.data
label.columns <- seq(2, 12, by = 2)
attribute <- matrix(NA, nrow = N, ncol = 6)
for (i in 1:N) {
  # labels used in this respondent's condition
  labels.i <- attribute.wording[[condition[i]]]
  for (slot in seq_along(label.columns)) {
    attribute[i, slot] <- match(raw.data[i, label.columns[slot]], labels.i)
  }
}

# record the attribute levels of eight hypothetical profiles for each respondent
# position[i, r, k] is the level code shown in table row r of profile k.
# level.columns[[k]] lists the six raw.data columns holding profile k's level
# texts: odd-numbered profiles sit in every other column, even-numbered
# profiles in six adjacent columns.
level.columns <- list(
  seq(3, 13, 2), 14:19,
  seq(21, 31, 2), 32:37,
  seq(39, 49, 2), 50:55,
  seq(57, 67, 2), 68:73
)
position <- array(NA, dim = c(N, 6, 8))
for (i in 1:N) {
  # level texts of this respondent's condition, indexed by attribute number
  wording.i <- level.wording[[condition[i]]]
  for (k in seq_along(level.columns)) {
    for (slot in 1:6) {
      # the text in row `slot` is looked up among the levels of the attribute
      # that was displayed in that row
      position[i, slot, k] <- match(raw.data[i, level.columns[[k]][slot]],
                                    wording.i[[attribute[i, slot]]])
    }
  }
}

# arrange the array of respondent * attribute * level by attributes
# position is ordered by table row; profile is ordered by attribute number,
# so profile[i, j, k] is the level of attribute j in profile k
profile <- array(NA, dim = c(N, 6, 8))
for (i in 1:N) {
  for (j in 1:6) {
    # table row in which respondent i saw attribute j
    shown.at <- which(attribute[i, ] == j)
    # copy that row's levels for all eight profiles at once
    profile[i, j, ] <- position[i, shown.at, ]
  }
}

# record each respondent's answers for choice questions
# columns 2t - 1 and 2t belong to the two profiles of task t; with answers
# coded 1 or 2, the chosen profile gets 1 and the other one gets 0
# answers to the four tasks (Q14-Q17), one matrix per condition
picked.D <- cbind(raw.data$Q14.1.D, raw.data$Q15.1.D,
                  raw.data$Q16.1.D, raw.data$Q17.1.D)
picked.R <- cbind(raw.data$Q14.1.R, raw.data$Q15.1.R,
                  raw.data$Q16.1.R, raw.data$Q17.1.R)
choice <- matrix(NA, N, 8)
for (i in 1:N) {
  picked <- if (condition[i] == 1) picked.D[i, ] else picked.R[i, ]
  choice[i, c(1, 3, 5, 7)] <- 2 - picked  # first profile of each task
  choice[i, c(2, 4, 6, 8)] <- picked - 1  # second profile of each task
}

# record each respondent's answers for rating questions
# larger numbers indicate "more Democrat" in the Democrat condition
# and "more Republican" in the Republican condition
# Columns 2t - 1 and 2t hold the ratings of the first and second profile of
# task t (Q14-Q17). Column 4 therefore reads the second item of Q15; it used
# to read the second item of Q14 again, duplicating column 2.
rating <- matrix(NA, N, 8)
for (i in seq_len(N)) {
  if (condition[i] == 1) {
    # Democratic condition: the scale is flipped with 8 - x
    rating[i, 1] <- 8 - raw.data$Q14.2.D_1[i]
    rating[i, 2] <- 8 - raw.data$Q14.2.D_2[i]
    rating[i, 3] <- 8 - raw.data$Q15.2.D_1[i]
    rating[i, 4] <- 8 - raw.data$Q15.2.D_2[i]
    rating[i, 5] <- 8 - raw.data$Q16.2.D_1[i]
    rating[i, 6] <- 8 - raw.data$Q16.2.D_2[i]
    rating[i, 7] <- 8 - raw.data$Q17.2.D_1[i]
    rating[i, 8] <- 8 - raw.data$Q17.2.D_2[i]
  }
  else {
    # Republican condition: the recorded value is used as is
    rating[i, 1] <- raw.data$Q14.2.R_1[i]
    rating[i, 2] <- raw.data$Q14.2.R_2[i]
    rating[i, 3] <- raw.data$Q15.2.R_1[i]
    rating[i, 4] <- raw.data$Q15.2.R_2[i]
    rating[i, 5] <- raw.data$Q16.2.R_1[i]
    rating[i, 6] <- raw.data$Q16.2.R_2[i]
    rating[i, 7] <- raw.data$Q17.2.R_1[i]
    rating[i, 8] <- raw.data$Q17.2.R_2[i]
  }
}

# record time spent for each conjoint task
# to drop respondent * profile observations in which
# respondents spent five seconds or less
# task timers, one matrix per condition (columns = tasks 1 to 4)
seconds.D <- cbind(raw.data$T9.D_3, raw.data$T10.D_3,
                   raw.data$T11.D_3, raw.data$T12.D_3)
seconds.R <- cbind(raw.data$T9.R_3, raw.data$T10.R_3,
                   raw.data$T11.R_3, raw.data$T12.R_3)
time.mat <- matrix(NA, N, 8)
for (i in 1:N) {
  seconds <- if (condition[i] == 1) seconds.D[i, ] else seconds.R[i, ]
  # both profiles of a task share that task's flag
  time.mat[i, ] <- rep(seconds > 5, each = 2)
}
# flatten respondent by respondent to line up with the profile-level rows
time <- c(t(time.mat))

# create dataset whose unit of observation is respondent * profile
# (eight consecutive rows per respondent, one per profile)
entire.data <- local({
  # flatten an N x 8 matrix respondent by respondent
  unroll <- function(m) as.vector(t(m))
  # repeat a respondent-level vector once for each of the eight profiles
  per.profile <- function(v) rep(v, each = 8)
  # the k-th profile of every respondent carries the k-th name
  profile.names <- c("James", "Richard", "Mary", "Jennifer",
                     "Robert", "Michael", "Patricia", "Elizabeth")

  data.frame(
    # respondent ID (used to cluster standard errors by respondent)
    respondent.id = per.profile(1:N),
    # profile ID
    candidate.id = rep(1:8, times = N),
    # profile names
    candidate.name = factor(rep(profile.names, times = N),
                            levels = profile.names),
    choice = unroll(choice),
    rating = unroll(rating),
    # attribute factors; the first level listed is the reference category
    a.parents = factor(unroll(profile[, 1, ]),
                       levels = c(2, 1, 3),
                       labels = c("Independent", "a.same", "a.opponent")),
    b.identity = factor(unroll(profile[, 3, ]),
                        levels = c(2, 1),
                        labels = c("don't matter", "personal insult")),
    c.activity = factor(unroll(profile[, 2, ]),
                        levels = 5:1,
                        labels = c("never", "rarely", "sometimes",
                                   "often", "very often")),
    d.policy = factor(unroll(profile[, 4, ]),
                      levels = 3:1,
                      labels = c("not approve", "partially", "completely")),
    e.habit = factor(unroll(profile[, 5, ]),
                     levels = 4:1,
                     labels = c("always abstained", "sometimes abstained",
                                "sometimes deviated", "always loyal")),
    f.intention = factor(unroll(profile[, 6, ]),
                         levels = c(3, 1, 2),
                         labels = c("abstain", "f.same", "f.opponent")),
    # 0 indicates Democrat condition, 1 Republican condition
    condition = per.profile(condition - 1),
    # respondent-level covariates and attention flags
    gender = per.profile(gender),
    age = per.profile(age),
    education = per.profile(education),
    black = per.profile(black),
    other.race = per.profile(other.race),
    region = per.profile(region),
    knowledge = per.profile(knowledge.all),
    PID = per.profile(PID),
    ideology = per.profile(ideology.all),
    directed = per.profile(directed),
    IMC = per.profile(IMC),
    # task-level timing flag, already one value per profile
    time = time
  )
})

save(list = "entire.data", file = "entire_data.Rdata")

#### raking weights ####
# load ACS and CPS data
acs <- read_csv(file = "ACS.csv")
cps <- read_csv(file = "CPS.csv")

# create survey object based on the attentive.data
# Each raking variable is a factor whose levels match the population tables.
# Codes not listed explicitly fall into the last level; missing codes stay NA.

# gender: code 0 -> Male, any other code -> Female
attentive.data[["gender.f"]] <- factor(
  attentive.data$gender == 0,
  levels = c(TRUE, FALSE),
  labels = c("Male", "Female")
)
# age: codes 1-5 -> a20 ... a60, any other code -> a70
attentive.data[["age.f"]] <- factor(
  ifelse(is.na(attentive.data$age), NA,
         match(attentive.data$age, 1:5, nomatch = 6L)),
  levels = 1:6,
  labels = c("a20", "a30", "a40", "a50", "a60", "a70")
)
# race: Q21 code 1 -> white, 2 -> black, any other code -> other
attentive.data[["race.f"]] <- factor(
  ifelse(is.na(attentive.data$Q21), NA,
         match(attentive.data$Q21, 1:2, nomatch = 3L)),
  levels = 1:3,
  labels = c("white", "black", "other")
)
# education: codes 1-3 as labelled, any other code -> college
attentive.data[["educ.f"]] <- factor(
  ifelse(is.na(attentive.data$education), NA,
         match(attentive.data$education, 1:3, nomatch = 4L)),
  levels = 1:4,
  labels = c("less_high", "highschool", "somecollege", "college")
)
# region: codes 1-3 as labelled, any other code -> West
attentive.data[["region.f"]] <- factor(
  ifelse(is.na(attentive.data$region), NA,
         match(attentive.data$region, 1:3, nomatch = 4L)),
  levels = 1:4,
  labels = c("Northeast", "Midwest", "South", "West")
)
# raking needs complete margins, so respondents without education are dropped
attentive.data.sub <- attentive.data[!is.na(attentive.data$education), ]
attentive.svydata <- svydesign(ids = ~ 1, data = attentive.data.sub)

# create population targets, ACS 2012-2016
# one data frame per raking variable, in the order of the sample margins
# passed to rake(); Freq holds the population count of each level
pop.acs <- local({
  # add up every cell of the selected ACS columns
  total <- function(...) sum(dplyr::select(acs, ...))
  list(
    data.frame(gender.f = c("Male", "Female"),
               Freq = c(sum(acs$AF2AE002), sum(acs$AF2AE026))),
    # each age group combines one column range per gender block
    data.frame(age.f = c("a20", "a30", "a40", "a50", "a60", "a70"),
               Freq = c(total(AF2AE007:AF2AE011, AF2AE031:AF2AE035),
                        total(AF2AE012:AF2AE013, AF2AE036:AF2AE037),
                        total(AF2AE014:AF2AE015, AF2AE038:AF2AE039),
                        total(AF2AE016:AF2AE017, AF2AE040:AF2AE041),
                        total(AF2AE018:AF2AE021, AF2AE042:AF2AE045),
                        total(AF2AE022:AF2AE025, AF2AE046:AF2AE049))),
    data.frame(race.f = c("white", "black", "other"),
               Freq = c(sum(acs$AF2ME002), sum(acs$AF2ME003),
                        total(AF2ME004:AF2ME008))),
    data.frame(educ.f = c("less_high", "highschool", "somecollege", "college"),
               Freq = c(total(AF4OE002:AF4OE016),
                        total(AF4OE017:AF4OE018),
                        total(AF4OE019:AF4OE021),
                        total(AF4OE022:AF4OE025))),
    # NOTE(review): uses the AF2AE001 column as is, so the ACS file is assumed
    # to have one row per region in this order -- confirm
    data.frame(region.f = c("Northeast", "Midwest", "South", "West"),
               Freq = acs$AF2AE001)
  )
})

# compute the raking weights based on ACS population target
rake.acs <- rake(design = attentive.svydata,
                 sample.margins = list(~ gender.f, ~ age.f, ~ race.f,
                                       ~ educ.f, ~ region.f),
                 population.margins = pop.acs,
                 control = list(maxit = 100))

# create population targets, CPS 2016
# one data frame per raking variable, in the order of the sample margins
# passed to rake(); Freq is a count of CPS rows
pop.cps <- local({
  # only rows with VOTED == 2 are counted
  # (presumably the "voted" code -- confirm with the CPS codebook)
  in.target <- cps$VOTED == 2
  # number of target rows that also satisfy `keep`; NA tests are not counted
  n.rows <- function(keep) sum(in.target & keep, na.rm = TRUE)
  list(
    data.frame(gender.f = c("Male", "Female"),
               Freq = c(n.rows(cps$SEX == 1),
                        n.rows(cps$SEX == 2))),
    data.frame(age.f = c("a20", "a30", "a40", "a50", "a60", "a70"),
               Freq = c(n.rows(cps$AGE <= 29),
                        n.rows(cps$AGE >= 30 & cps$AGE <= 39),
                        n.rows(cps$AGE >= 40 & cps$AGE <= 49),
                        n.rows(cps$AGE >= 50 & cps$AGE <= 59),
                        n.rows(cps$AGE >= 60 & cps$AGE <= 69),
                        n.rows(cps$AGE >= 70))),
    data.frame(race.f = c("white", "black", "other"),
               Freq = c(n.rows(cps$RACE == 100),
                        n.rows(cps$RACE == 200),
                        n.rows(cps$RACE > 200))),
    data.frame(educ.f = c("less_high", "highschool", "somecollege", "college"),
               Freq = c(n.rows(cps$EDUC <= 72),
                        n.rows(cps$EDUC == 73),
                        n.rows(cps$EDUC >= 74 & cps$EDUC <= 110),
                        n.rows(cps$EDUC >= 111))),
    data.frame(region.f = c("Northeast", "Midwest", "South", "West"),
               Freq = c(n.rows(cps$REGION <= 12),
                        n.rows(cps$REGION >= 21 & cps$REGION <= 22),
                        n.rows(cps$REGION >= 31 & cps$REGION <= 33),
                        n.rows(cps$REGION >= 41 & cps$REGION <= 42)))
  )
})

# compute the raking weights based on CPS population target
rake.cps <- rake(design = attentive.svydata,
                 sample.margins = list(~ gender.f, ~ age.f, ~ race.f,
                                       ~ educ.f, ~ region.f),
                 population.margins = pop.cps,
                 control = list(maxit = 100))

# create a data.frame containing weights
# (one row per respondent that entered the raking step)
rake.data <- data.frame(
  ID = attentive.data.sub$ID,
  weight.acs = weights(rake.acs),
  weight.cps = weights(rake.cps)
)

#### final dataset (only attentive responses) ####
# drop satisficing responses
# keep respondent * profile rows where both flags are TRUE; rows with a
# missing flag are dropped as well
clean.data <- entire.data[which(entire.data$directed == TRUE &
                                  entire.data$time == TRUE), ]

# merge rake.data to clean.data
# all = TRUE keeps unmatched rows from either side, so respondents without
# weights stay in the data with NA weights
clean.data <- merge(
  clean.data, rake.data,
  by.x = "respondent.id", by.y = "ID",
  all = TRUE
)

save(list = "clean.data", file = "clean_data.Rdata")